################################
## Name: 08_ngrams.R
## Purpose: This script runs the ngram analysis through the bioweapons documents. 
## Data In: 
## 1) bioweapons articles
## Note: this is not available in
## the replication package as it contains
## data we can't share due to copyright
## we've kept the full path on NYU's Greene
## HPC as a reference
## /scratch/olympus/projects/russia_ukraine_war/bioweapons_new/bioweapons_casestudy_5_20_2024_sbert_embeddings.json
## Data Out:
## 1) /scratch/olympus/projects/russia_ukraine_war/bioweapons_public/precision_ngrams_test.rds
## precision coding master file for ngram analysis, includes the following columns
## a) cutoff: 5-word gram cosine similarity between ego and alter article
## b) stratum: binned version of cutoff (either ".2 < .4", ".4 < .6", or ".6+")
## c) ego: article id for ego article
## d) alter: article id for alter article
## e) ego_article: english text for ego article
## f) alter_article: english text for alter article
## g) same_subject: NA, to be filled in by RA
## h) same_claim: NA, to be filled in by RA
## i) coder: label for the RA ("RA1" or "RA2") assigned to code the pair
## create two additional versions of this dataframe, one for each RA,
## removing the "cutoff" and "stratum" variables
## /scratch/olympus/projects/russia_ukraine_war/bioweapons_public/precision_ngrams_test_ra2.csv
## /scratch/olympus/projects/russia_ukraine_war/bioweapons_public/precision_ngrams_test_ra1.csv
## 2) /scratch/olympus/projects/russia_ukraine_war/bioweapons_public/ngrams_edges.rds
## includes the list of edges between articles above .2 5-word gram threshold
## a) ego_id
## b) alter_id
## c) sim: 5 word gram cosine similarity
## d) ego_date: date of ego article
## e) alter_date: date of alter article
## f) date_range: whether in 5 day date range (all "yes")
## Notes:
## We can't share the file total_art below to replicate this code
## because it includes raw text. We include a derivative file
## that we can share below. 

library(jsonlite)
library(quanteda)
library(igraph)
library(tidyverse)

## Restricted input: the raw article text cannot be redistributed,
## so this call errors for anyone without access to the original file
total_art <- "/scratch/olympus/projects/russia_ukraine_war/bioweapons_new/bioweapons_casestudy_5_20_2024_sbert_embeddings.json" %>%
  file() %>%
  jsonlite::stream_in()

#' Row-wise cosine similarity between two matrices
#'
#' @param small A matrix-like object; each row is one vector.
#' @param big A matrix-like object with the same number of columns as `small`.
#' @return A matrix with one row per row of `small` and one column per row of
#'   `big`; entry (i, j) is the cosine similarity of row i of `small` and
#'   row j of `big`. An all-zero row has zero norm, so its entries are 0 / 0.
cos_sim <- function(small, big) {
  # numerator: dot product of every row of `small` with every row of `big`
  dot_products <- small %*% t(big)
  # denominator: product of the two rows' Euclidean norms, computed as
  # sqrt(|a|^2 * |b|^2) from the squared row norms
  norm_products <- sqrt(tcrossprod(rowSums(small^2), rowSums(big^2)))
  dot_products / norm_products
}

## Shareable version of total_art, read from the replication
## package's data/ directory
total_art_public <- "data/bioweapons_casestudy_5_20_2024_sbert_embeddings.json" %>%
  file() %>%
  jsonlite::stream_in()

##########################
### Prep articles ########
##########################

## Build the corpus from the translated article text,
## using article_id as the document id
## (needs the restricted total_art data frame)
corpus <- corpus(total_art,
                 text_field = "content_translated",
                 docid_field = "article_id")

## Token pipeline, in this order:
##   1. tokenize, dropping URLs and punctuation
##   2. lowercase
##   3. remove English stopwords
##   4. stem
##   5. build 5-grams
## note: we don't remove numbers
## because we're measuring exact text matching here
tokens <- corpus %>%
  tokens(remove_url = TRUE, remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_select(pattern = stopwords("en"), selection = "remove") %>%
  tokens_wordstem() %>%
  tokens_ngrams(n = 5)

## document-feature matrix of the 5-grams; this is the input
## to the between-article cosine similarity
dfm <- dfm(tokens)

## saving derivative dfm object
#saveRDS(dfm, "data/ngram_dfm.rds")

##############################
### Replication Code #########
##############################

## Replicators start here: load the shareable derivative dfm
## rather than rebuilding it from the raw text
dfm <- readRDS("data/ngram_dfm.rds")

## article-by-article cosine similarity of the 5-gram counts
m <- cos_sim(dfm, dfm)

## read the similarity matrix as a weighted, undirected graph
net <- graph_from_adjacency_matrix(m, weighted = TRUE, mode = "undirected")

## one row per edge: the two endpoints plus the edge weight (the similarity)
edgelist <- net %>%
  as_edgelist() %>%
  as.data.frame()
names(edgelist)[1:2] <- c("ego_id", "alter_id")
edgelist$sim <- E(net)$weight

## limit to pairs that are not self-referents
edgelist <- filter(edgelist, ego_id != alter_id)


## limit to pairs within 5 day range
## Look up each endpoint's publication date in the public article table.
ego_row <- match(edgelist$ego_id, total_art_public$article_id)
alter_row <- match(edgelist$alter_id, total_art_public$article_id)
edgelist$ego_date <- as.Date(total_art_public$final_date[ego_row])
edgelist$alter_date <- as.Date(total_art_public$final_date[alter_row])

## Absolute gap in days between the two articles. This is plain Date
## arithmetic, so it does not need lubridate::days(), which is only on the
## search path when the installed tidyverse attaches lubridate.
## |ego - alter| <= 5 is the same window as
## alter - 5 <= ego <= alter + 5.
day_gap <- abs(as.numeric(edgelist$ego_date - edgelist$alter_date,
                          units = "days"))
edgelist$date_range <- ifelse(day_gap <= 5, "yes", "no")

## pairs with a missing date get NA here and are dropped by filter()
edgelist <- edgelist %>%
  filter(date_range == "yes")

## Order-independent pair key: the two ids sorted as strings and joined
## with "_". Computed column-wise with pmin()/pmax() instead of apply(),
## which would coerce every row of the data frame to character first.
ego_chr <- as.character(edgelist$ego_id)
alter_chr <- as.character(edgelist$alter_id)
edgelist$match_id <- paste(pmin(ego_chr, alter_chr),
                           pmax(ego_chr, alter_chr),
                           sep = "_")

## sanity check: no unordered pair should appear twice
sum(duplicated(edgelist$match_id)) ## 0


## Bin every pair by cosine similarity; validation pairs are sampled
## from the bins at or above .2.
## Bins are closed on the left: below .2, [.2, .4), [.4, .6), .6 and up
stratum_labels <- c("Less than .2", ".2 < .4", ".4 < .6", ".6+")
stratum_breaks <- c(.2, .4, .6)
## findInterval() gives 0 below the first break, 1 from the first break
## up to the second, and so on; + 1L turns that into a label index
edgelist$stratum <- stratum_labels[findInterval(edgelist$sim,
                                                stratum_breaks) + 1L]

## number of pairs per stratum - needed for denominator in precision
## calculations
counts_edgelist <- edgelist %>%
  group_by(stratum) %>%
  tally()
table(edgelist$stratum)
## Less than .2 52991 pairs out of 55105
## .2 < .4 1368 pairs out of 55105
## .4 < .6 556 pairs out of 55105
## .6+ 190 pairs out of 55105


## Stratified validation sample: 30 pairs, without replacement, from each
## stratum at or above .2. Strata are visited in a fixed order so the
## draws are reproducible under the seed.
set.seed(08540)
sampled_strata <- c(".2 < .4", ".4 < .6", ".6+")
samp <- unlist(lapply(sampled_strata, function(label) {
  sample(which(edgelist$stratum == label), 30, replace = FALSE)
}))

precision_coding <- edgelist[samp, ] ## 90 / 55,105 pairs

## attach the shareable article summaries for both sides of each pair
ego_lookup <- match(precision_coding$ego_id, total_art_public$article_id)
alter_lookup <- match(precision_coding$alter_id, total_art_public$article_id)
precision_coding$ego_article <- total_art_public$summary[ego_lookup]
precision_coding$alter_article <- total_art_public$summary[alter_lookup]

## empty coding columns, to be filled in by the RAs
precision_coding[c("same_subject", "same_claim", "refutation")] <- NA

## shuffle the rows so they are no longer grouped by stratum
precision_coding <- precision_coding[sample.int(nrow(precision_coding)), ]

## merge in denominator data: size of the stratum each pair was drawn from
stratum_lookup <- match(precision_coding$stratum, counts_edgelist$stratum)
precision_coding$stratum_denominator_counts <- counts_edgelist$n[stratum_lookup]


## creating master file
#saveRDS(precision_coding, "data/precision_ngrams_test_updated_10_9_2024.rds")

## RA file
## Split the shuffled pairs between the two RAs:
## rows 1-45 are labelled RA1, rows 46-90 are labelled RA2
## NOTE(review): the previous comment here said each file should be coded by
## three RAs, but the code gives each pair exactly one of two RA labels - confirm
precision_coding$coder <- rep(c("RA1", "RA2"), each = 45)


#write.csv(precision_coding[precision_coding$coder == "RA1", c("ego_id",
#                             "alter_id",
#                             "ego_article",
#                          "alter_article",
#                            "same_subject",
#                           "same_claim",
#                            "refutation")], "data/precision_ngrams_test_ra1_updated_10_9_2024.csv")


#write.csv(precision_coding[precision_coding$coder == "RA2", c("ego_id",
#                                                             "alter_id",
#                                                              "ego_article",
#                                                              "alter_article",
#                                                              "same_subject",
#                                                              "same_claim",
#                                                              "refutation")], "data/precision_ngrams_test_ra2_updated_10_9_2024.csv")





#########################################
### Edgelist Pairs ##################


## keep only pairs with similarity strictly above .2
edgelist <- filter(edgelist, sim > .2)


#saveRDS(edgelist, "data/ngrams_edges.rds")

